#19-704 Assignment 2
#This code was often based on the 19-704 Week 2 Lecture Notes.

##############################################################
#Load Data and Packages
##############################################################

#install.packages("AER")
#install.packages("quantreg")
#Initialize data
library(AER)
library(quantreg)
data(CPS1988)
log.rwage <- log(CPS1988$wage) #log of weekly wages
ed <- CPS1988$education #years of schooling
exper <- CPS1988$experience #years of potentional work experience
race <-ifelse(CPS1988$ethnicity == 'cauc',0,1) #race dumm

#############################################################
#look at the data
#############################################################
#fix(CPS1988)
names(CPS1988)
objects(CPS1988)


#############################################################
#Replication Section
#1. Conduct the median (LAD) regressions. (1 point)
#2. Compare your results to those reported in the paper. (1 point)
#3. Briefly explain what you think the regression summaries mean (give it your best)
#############################################################

#Mincer Regression
mincer.reg.med = rq(log.rwage~ed + exper + I(exper^2) + race, 0.5)
summary(mincer.reg.med)		

#Quartic Regression
quartic.reg.med <- rq(log.rwage~ed + exper + 
									I(exper^2) + I(exper^3) + I(exper^4) +
									race, 0.5)
summary(quartic.reg.med)


#The residuals are included here for later comparison with question 5.  
png(filename = "hmwk2ResidMed.png", width = 3000, height = 3000, res = 300)
par(mfrow = c(2,1),cex = 1.3,
		mar = c(4,4,1,1),#mar and oma set the inner and out margins
		oma = c(1,1,1,1))

#plot(log.rwage, residuals(mincer.reg.med))
#plot(log.rwage, residuals(quartic.reg.med))
plot(fitted(mincer.reg.med),resid(mincer.reg.med))
plot(fitted(quartic.reg.med),resid(quartic.reg.med))

dev.off()

#1.Done.
#2. 
#3. 


#############################################################
#Extension Questions...
#############################################################



#############################################################
#Q1.Create histograms of the wages, log wages, education,
#and experience variables, with a summary of what you're seeing.
#############################################################


#Create Histogram plot
png(filename = "hmwk2histogram.png", width = 3000, #adjust png width
		height = 3000, #height
		res = 300)

par(mfrow = c(4,1), #sets number of rows and columns
		cex = 1.3, #scales text and symbols
		mar = c(4,3,1,0),#mar and oma set the inner and out margins
		oma = c(0,0,0,0))

hist(CPS1988$wage,
		 main = "Weekly Wages",
		 breaks = "FD",
		 xlab = "")

hist(log.rwage ,
		 main = "Log of Weekly Wages", breaks = "FD", xlab = "")

hist(ed, 
		 main = "Years of Schooling", breaks = "FD",
		 xlab = "")

hist(exper, 
		 main = "Years of Potential Work Experience",
		 breaks = "FD", xlab = "")
dev.off()



#############################################################
#Q2. Create scatterplots of wages and log wages against education and experience,
#with fitted linear or median regression lines, again with a summary.
#############################################################

#linear regression
reg.wage.ed = lm(CPS1988$wage~ed)
reg.wage.exp = lm(CPS1988$wage~exper)
reg.lwage.ed = lm(log.rwage~ed)
reg.lwage.exp = lm(log.rwage~exper)

#median regression
reg.wage.ed.med = rq(CPS1988$wage~ed, 0.5)
reg.wage.exp.med = rq(CPS1988$wage~exper,0.5)
reg.lwage.ed.med = rq(log.rwage~ed,0.5)
reg.lwage.exp.med = rq(log.rwage~exper,0.5)


png(filename = "hmwk2scatterplots.png", width = 3000, #adjust png width
		height = 4000, #height
		res = 300)

par(mfrow = c(4,1), #sets number of rows and columns
		cex = 1.3, #scales text and symbols
		mar = c(4,4,1,1),#mar and oma set the inner and out margins
		oma = c(1,1,1,1))
plot(ed,CPS1988$wage,
		 xlab = "Years of Education (yr)",
		 ylab = "Weekly Wages ($)")
abline(reg.wage.ed, col= 'black')
abline(reg.wage.ed.med, col = 'blue')

plot(exper, CPS1988$wage,
		 xlab = "Years of Potential Experience (yr)",
		 ylab = "Weekly Wages ($)")
abline(reg.wage.exp, col= 'black')
abline(reg.wage.exp.med, col = 'blue')

plot(ed,log.rwage,
		 xlab = "Years of Education (yr)",
		 ylab = "Log of Weekly Wages($)")
abline(reg.lwage.ed, col= 'black')
abline(reg.lwage.ed.med, col = 'blue')

plot(exper, log.rwage,
		 xlab = "Years of Potential Experience",
		 ylab = "Log of Weekly Wages($)")
abline(reg.lwage.exp, col= 'black')
abline(reg.lwage.exp.med, col = 'blue')
dev.off()



#############################################################
#Q.3:  Compare wages and log wages to the normal distribution,
#with a summary of what you think is going on.
#############################################################
#approach: Q-Q plots will be used to compare the distributions


# 10,000 draws from the normal distribution
set.seed(7)
norm <- rnorm(10000, mean(CPS1988$wage), sd(CPS1988$wage))
lognorm <-rnorm(10000, mean(log.rwage), sd(log.rwage))

png(filename = "hmwk2QQ.png", width = 3000, height = 3000, res = 300)
par(mfrow = c(2,1),cex = 1.3,
		mar = c(4,4,1,1),#mar and oma set the inner and out margins
		oma = c(1,1,1,1))
k1 = min(length(CPS1988$wage),length(norm))
k2 = min(length(log.rwage),length(norm))

#create a sequence of quantiles
wages.q <- quantile(CPS1988$wage, probs = 
											(seq(1, k1, 1) - .5)/k1, type = 4)

# create a sequence of quantiles
logwages.q <- quantile(log.rwage, probs = 
											 	(seq(1, k2, 1) - .5)/k2, type = 4) 
normal.q <- quantile(norm,probs = (seq(1, k1, 1) - .5)/k1, type = 4)
lognormal.q <- quantile(lognorm,probs = (seq(1, k2, 1) - .5)/k2, type = 4)

min.q1 <- floor(min(wages.q,normal.q))
max.q1 <- ceiling(max(wages.q,normal.q))
plot(normal.q, wages.q,
		 xlim = c(min.q1, max.q1),
		 ylim = c(min.q1, max.q1),
		 ylab = "Quantiles of Wages",
		 xlab = "Quantiles of Normal Distribution",
		 pch = 19,
		 col = rgb(0, 0, 0, 0.4))
abline(0, 1)

min.q2 <- floor(min(logwages.q,lognormal.q))
max.q2 <- ceiling(max(logwages.q,lognormal.q))
plot(lognormal.q, logwages.q,
		 xlim = c(min.q2, max.q2),
		 ylim = c(min.q2, max.q2),
		 ylab = "Quantiles of Log Wages",
		 xlab = "Quantiles of Normal Distribution",
		 pch = 19,
		 col = rgb(0, 0, 0, 0.4))
abline(0, 1)
dev.off()

 

################################################################
#Q.4: Plot the regression residuals for the regressions in Tables 1.A and 2.A,
#using ordinary linear regression rather than median regression.
#Are there outliers? If so, what type are they? Would median regression
#or ordinary linear regression be more appropriate for these data?
#############################################################


#Linear Regression
#Mincer Regression
mincer.reg.lin = lm(log.rwage~ed + exper + I(exper^2) + race)
summary(mincer.reg.lin)		

#Quartic Regression
quartic.reg.lin <- lm(log.rwage~ed + exper + 
										I(exper^2) + I(exper^3) + I(exper^4) +
										race)
summary(quartic.reg.lin)


png(filename = "hmwk2ResidLin.png", width = 3000, height = 3000, res = 300)
par(mfrow = c(2,1),cex = 1.3,
		mar = c(4,4,4,4),#mar and oma set the inner and out margins
		oma = c(1,1,1,1))

#this doesn't work
#plot(log.rwage, resid(mincer.reg.lin))
#plot(log.rwage, resid(quartic.reg.lin))
#this does
plot(fitted(mincer.reg.lin),resid(mincer.reg.lin),
		 main = "Linear Model",
		 xlab = "Wage-Fitted Values ($)",
		 ylab = "Residual")
plot(fitted(quartic.reg.lin),resid(quartic.reg.lin),
		 main = "Quartic Model",
		 xlab = "Wage-Fitted Values ($)",
		 ylab = "Residual")
		 

dev.off()

png(filename = "hmwk2mincerinfluenceindex.png", width = 3000, height = 3000, res = 300)
# Influence index plot and identify the largest three observations
influenceIndexPlot(mincer.reg.lin, id.n = 3)
dev.off()

png(filename = "hmwk2quarticinfluenceindex.png", width = 3000, height = 3000, res = 300)
# Influence index plot and identify the largest three observations
influenceIndexPlot(quartic.reg.lin, id.n = 3)
dev.off()


# #install.packages("car", repos = "http://lib.stat.cmu.edu/R/CRAN/")
# library(car)
# png(filename = "hmwk2qqreg.png", width = 3000, height = 3000, res = 300)
# #par(mfrow = c(1, 2), cex = 1.3, mar = c(5, 4, 4, 1))
# # qqplot of the regression residuals from the real regression
# # id.n = 3 identifies the three largest residuals
# qqPlot(mincer.reg.lin,
# 			 main = "Observed Data",
# 			 id.n = 3,
# 			 pch = 19,
# 			 col = rgb(0, 0, 0, .5))
# abline(0, 1)
# dev.off()



################################################################
#Q.5: Using median or ordinary linear regression, evaluate the
#backcasting quality of the Mincer type model (Table 1.A).
#Use 5-fold cross-validation to test whether this model is too complex.
################################################################

#backcast
png(filename = "hmwk2backcast.png", width = 3000, height = 3000, res = 300)
par(cex = 1.3)
# Plot the predicted happiness scores versus the observed happiness scores


plot(predict(mincer.reg.lin), log.rwage,
		 xlim = c(min(predict(mincer.reg.lin)), max(predict(mincer.reg.lin))),
		 ylim = c(min(log.rwage), max(log.rwage)),
		 ylab = "Actual Log of the Wages",
		 xlab = "Predicted Log of the Wages",
		 pch = 19,
		 col = rgb(0, 0, 0, .3))
abline(0, 1)
dev.off()

#forecast

# Sample 4/5 of the transformed "log of the wage" data
# Sample without replacement and use it as training data

#add cell id to CPS1988
CPS1988$id <- 1:length(CPS1988$wage)
#fix(CPS1988)#check

set.seed(10)
rMSEavg1 = c()
rMSEavg2 = c()
rMSEavg3 = c()
rMSEavg4 = c()
rMSEavg5 = c()
rMSEavg6 = c()

for(i in 1:1000){ # Loop i from 1 to 1000

	train.id <- sample(CPS1988$id, # Sample the cell ids
									4*length(CPS1988$id)/5, # Sample 2/3 of the data
									replace = FALSE) # Sample without replacement
	# Create "test" data that are the cell ids not selected
	test.id <- CPS1988$id[ - train.id]
	
	#run linear regressions on the training and test data
	#for the full model and less complex models
	
	#TRAINING 
	#full regression
	train1 = lm(log(wage)~education + experience
											+ I(experience^2) + ethnicity,
											data = CPS1988[CPS1988$id %in% train.id,])
	
	#subtract quadratic term
	train2 = lm(log(wage)~education + experience + ethnicity,
											 data = CPS1988[CPS1988$id %in% train.id,])
	
	#only education
	train3 = lm(log(wage)~education,
											 data = CPS1988[CPS1988$id %in% train.id,])
	
	#only experience
	train4 = lm(log(wage)~experience,
											 data = CPS1988[CPS1988$id %in% train.id,])
	
	#only race
	train5 = lm(log(wage)~ethnicity,
											 data = CPS1988[CPS1988$id %in% train.id,])
	
	#only constant
	train6 = lm(log(wage)~1,
											 data = CPS1988[CPS1988$id %in% train.id,])
	
	#TESTING
	test1 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train1, CPS1988[CPS1988$id %in% test.id,]))^2
	
	test2 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train2, CPS1988[CPS1988$id %in% test.id,]))^2
	
	test3 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train3, CPS1988[CPS1988$id %in% test.id,]))^2
	
	test4 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train4, CPS1988[CPS1988$id %in% test.id,]))^2
	
	test5 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train5, CPS1988[CPS1988$id %in% test.id,]))^2
	
	test6 <- (log(CPS1988$wage[CPS1988$id %in% test.id])-
							predict(train6, CPS1988[CPS1988$id %in% test.id,]))^2
	
	rMSEtest1 <- sqrt(sum(test1)/length(test1))
	rMSEtest2 <- sqrt(sum(test2)/length(test2))
	rMSEtest3 <- sqrt(sum(test3)/length(test3))
	rMSEtest4 <- sqrt(sum(test4)/length(test4))
	rMSEtest5 <- sqrt(sum(test5)/length(test5))
	rMSEtest6 <- sqrt(sum(test6)/length(test6))
	
	
	rMSEavg1 <- append(rMSEavg1, rMSEtest1)
	rMSEavg2 <- append(rMSEavg2, rMSEtest2)
	rMSEavg3 <- append(rMSEavg3, rMSEtest3)
	rMSEavg4 <- append(rMSEavg4, rMSEtest4)
	rMSEavg5 <- append(rMSEavg5, rMSEtest5)
	rMSEavg6 <- append(rMSEavg6, rMSEtest6)
}

summary(rMSEavg1)
summary(rMSEavg2)
summary(rMSEavg3)
summary(rMSEavg4)
summary(rMSEavg5)
summary(rMSEavg6)

#Summary: The rMSE increases as the complexity decreases.
#Should more be done for the backcasting??


################################################################
#Q6. Discuss the statistical inference story. Explain why using
#confidence intervals would be valid or invalid for these data. 
################################################################


################################################################
#7. Discuss the causality story. To what degree do you think we
#can make causal claims about the regression models used?
#Explain your reasoning. 
################################################################
#In an ideal world, random assignment allows researchers
#to make a claim about causality.  Unfortunately, it is difficult
#to make an experiment truly random, especially with people.
#In this experiment, it is not truly random because it is a survey.
#Some recipiments of the survey do not respond.  Another possiblity
#is that the survey is biased against some members of the population.
#For example, if a person lives without access the internet, a telephone,
#or a mailing address, they might be missed by the survey.

#how is a survey, which just collects a bunch of data 
#different than a control/experimental group.  

#Volunteerism and attrition

################################################################
#Q8. (Optional) Extend the data summary and/or conditional
#distribution stories in novel ways, either using methods
#covered in the lecture notes, or methods that you've
#developed yourself. (1-2 bonus points)
################################################################
